from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
import numpy as np
import sklearn.metrics as metrics
from inspect import signature
import seaborn as sns
import shap
from sklearn import preprocessing
import matplotlib.pyplot as plt
shap.initjs()
import pandas as pd
figuresize = (8,6)
%matplotlib inline
import os
# Get the data
all_testy_hat = np.load('all_testy_hat.npy')
all_testys = np.load('all_testys.npy')
all_trainy_hat = np.load('all_trainy_hat.npy')
all_trainys = np.load('all_trainys.npy')
all_validy_hat = np.load('all_validy_hat.npy')
all_validys = np.load('all_validys.npy')
cutpoint_avg = np.load('cutpoint_avg.npy')
test_evalmetrics = np.load('test_evalmetrics.npy')
threshold_avg = np.load('threshold_avg.npy')
train_evalmetrics = np.load('train_evalmetrics.npy')
valid_evalmetrics = np.load('valid_evalmetrics.npy')
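# Split the predicted scores by true class and summarize each side with
# deciles; well-separated distributions indicate a discriminative model.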
deciles = np.arange(0, 101, 10)
all_trainy_hat_1 = all_trainy_hat[all_trainys == 1.0]
all_trainy_hat_0 = all_trainy_hat[all_trainys == 0.0]
all_train_percentile_1 = np.percentile(all_trainy_hat_1, deciles)
all_train_percentile_0 = np.percentile(all_trainy_hat_0, deciles)
all_testy_hat_1 = all_testy_hat[all_testys == 1.0]
all_testy_hat_0 = all_testy_hat[all_testys == 0.0]
all_test_percentile_1 = np.percentile(all_testy_hat_1, deciles)
all_test_percentile_0 = np.percentile(all_testy_hat_0, deciles)
y_true = all_testys      # true labels
y_probas = all_testy_hat # predicted probabilities for the positive class
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=1)
auc = metrics.auc(fpr, tpr)  # trapezoidal area under the ROC curve
# Plot ROC curve
plt.figure(figsize=figuresize)
plt.title('ROC curve with AUC = {:.4f}'.format(auc))
plt.plot(fpr, tpr)
plt.show()
# Print AUC
print('AUC:', auc)
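# Cross-check (assumes y_probas scores the positive class): sklearn's
# closed-form roc_auc_score should agree with the trapezoidal AUC above.
print('AUC (roc_auc_score): {:.4f}'.format(metrics.roc_auc_score(y_true, y_probas)))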
precision, recall, _ = metrics.precision_recall_curve(y_true, y_probas)
average_precision = metrics.average_precision_score(y_true, y_probas)
plt.figure(figsize=figuresize)
# Older matplotlib versions lack the `step` kwarg on fill_between.
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
#plt.ylim([0.0, 1.05])
#plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.show()
import tensorflow as tf
import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
#logging.debug("test")
import training
from config import args, model_hps, train_hps, input_pipeline_hps, dir_hps #, Colnames, REDIS_PWD, REDIS_HOSTS_SETS
from utils import output2csv #, transfer2redis
# LABEL_NUM_2..4 are referenced below, so they are imported here as well.
from config import IAB_NUM, LABEL_NUM_1, LABEL_NUM_2, LABEL_NUM_3, LABEL_NUM_4, CLIENT_LABEL_NUM, ADCATEGORIES_NUM, INT_FEATURES_LIST, FLOAT_FEATURES_LIST
model_hps
train_hps
dir_hps
input_pipeline_hps.batch_size = 256
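# Count records by streaming every TFRecord file once (O(N), TF1-era API);
# the train/valid/test split sizes below are derived from this total.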
DATASET_SIZE = 0
for fn in input_pipeline_hps.data_file:
for _ in tf.python_io.tf_record_iterator(fn):
DATASET_SIZE += 1
logging.info('TOTAL DATASET_SIZE = %d', DATASET_SIZE)
DATASET_SIZE
def extract_tfrecords(data_record):
    """Parse one serialized tf.train.Example into a (feature dict, label) pair."""
    features = {}
    for int_feature in INT_FEATURES_LIST:
        features[int_feature] = tf.FixedLenFeature([1], tf.int64)
    for float_feature in FLOAT_FEATURES_LIST:
        features[float_feature] = tf.FixedLenFeature([1], tf.float32)
    features['var1'] = tf.VarLenFeature(tf.int64)
    features['var2'] = tf.VarLenFeature(tf.int64)
    features['var3'] = tf.FixedLenFeature([1, LABEL_NUM_1], tf.int64)
    features['var4'] = tf.FixedLenFeature([1, LABEL_NUM_2], tf.int64)
    sample = tf.parse_single_example(data_record, features)
    sample['var3'] = tf.reshape(sample['var3'], [tf.shape(sample['var3'])[0], -1])  # flatten trailing dims
    # Densify the variable-length id lists and turn them into multi-hot vectors.
    sample["var1"] = tf.sparse_tensor_to_dense(sample["var1"], default_value=0)
    sample["var1"] = tf.reduce_sum(tf.one_hot(sample["var1"], depth=LABEL_NUM_3), axis=0)
    sample["var2"] = tf.sparse_tensor_to_dense(sample["var2"], default_value=0)
    sample["var2"] = tf.reduce_sum(tf.one_hot(sample["var2"], depth=LABEL_NUM_4), axis=0)
    sample["var5"] = tf.cast(sample["var5"], tf.int32)
    y = sample["label"]  # 'label' is assumed to arrive via the feature lists above
    return (sample, y)
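# Illustrative sketch (not from the original notebook): the write-side inverse
# of extract_tfrecords, showing what one record matching the schema above looks
# like. All values are dummies; only the feature names and shapes matter.
def _int64_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

_example = tf.train.Example(features=tf.train.Features(feature=dict(
    [(name, _int64_feature([0])) for name in INT_FEATURES_LIST] +
    [(name, _float_feature([0.0])) for name in FLOAT_FEATURES_LIST] +
    [('var1', _int64_feature([3, 7])),             # variable-length id list
     ('var2', _int64_feature([5])),                # variable-length id list
     ('var3', _int64_feature([0] * LABEL_NUM_1)),  # fixed [1, LABEL_NUM_1] block
     ('var4', _int64_feature([0] * LABEL_NUM_2))]  # fixed [1, LABEL_NUM_2] block
)))
# _example.SerializeToString() would yield one record parseable by extract_tfrecords.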
full_dataset = tf.data.TFRecordDataset(input_pipeline_hps.data_file)
if input_pipeline_hps.is_test:
    test_ratio = 1.0 - input_pipeline_hps.train_ratio - input_pipeline_hps.valid_ratio
    train_size = int(input_pipeline_hps.train_ratio * DATASET_SIZE)
    valid_size = int(input_pipeline_hps.valid_ratio * DATASET_SIZE)
    test_size = int(test_ratio * DATASET_SIZE)
    logging.info('train_size: %d valid_size: %d test_size: %d', train_size, valid_size, test_size)
    # Freeze the shuffle order: with the default reshuffle_each_iteration=True,
    # take/skip would draw from a different permutation on each pass and the
    # splits could overlap.
    full_dataset = full_dataset.shuffle(buffer_size=DATASET_SIZE, reshuffle_each_iteration=False)
    full_dataset = full_dataset.map(extract_tfrecords,
                                    num_parallel_calls=input_pipeline_hps.num_cores)
    # take/skip carve out disjoint, contiguous train/valid/test slices.
    train_dataset = full_dataset.take(train_size)
    remaining = full_dataset.skip(train_size)
    valid_dataset = remaining.take(valid_size)
    test_dataset = remaining.skip(valid_size).take(test_size)
else:
    train_size = int(input_pipeline_hps.train_ratio * DATASET_SIZE)
    valid_size = DATASET_SIZE - train_size
    logging.info('train_size: %d valid_size: %d', train_size, valid_size)
    full_dataset = full_dataset.shuffle(buffer_size=DATASET_SIZE, reshuffle_each_iteration=False)
    full_dataset = full_dataset.map(extract_tfrecords, num_parallel_calls=input_pipeline_hps.num_cores)
    train_dataset = full_dataset.take(train_size)
    valid_dataset = full_dataset.skip(train_size)
sess = tf.InteractiveSession()
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(input_pipeline_hps.batch_size)
train_dataset = train_dataset.prefetch(input_pipeline_hps.prefetch_size)
train_iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = train_iterator.make_initializer(train_dataset)
train_next_batch = train_iterator.get_next()
sess.run(train_init_op)
test_dataset = test_dataset.repeat()
test_dataset = test_dataset.batch(input_pipeline_hps.batch_size)
test_dataset = test_dataset.prefetch(input_pipeline_hps.prefetch_size)
test_iterator = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes)
test_init_op = test_iterator.make_initializer(test_dataset)
test_next_batch = test_iterator.get_next()
sess.run(test_init_op)
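# Pull one concrete batch from each split; these numpy dicts are what the
# SHAP pipeline below consumes.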
train_data = sess.run(train_next_batch)
test_data = sess.run(test_next_batch)
np.shape(train_data[0]['lab1']), np.shape(train_data[0]['lab2']), np.shape(train_data[0]['lab3'])
features = ...  # not shown
len(features)
def preprocessor(train_data_dict):
    """Flatten one batch dict into a single 2-D (batch_size, n_cols) matrix."""
    blocks = [train_data_dict[0][feature] for feature in features]
    blocks.append(np.reshape(train_data_dict[0]['lab1'], (input_pipeline_hps.batch_size, LABEL_NUM_1)))
    blocks.append(np.reshape(train_data_dict[0]['lab2'], (input_pipeline_hps.batch_size, LABEL_NUM_2)))
    blocks.append(np.reshape(train_data_dict[0]['lab3'], (input_pipeline_hps.batch_size, LABEL_NUM_3)))
    return np.hstack(blocks)
def data_to_df(data):
    """Wrap a flattened batch in a DataFrame with one named column per input."""
    cols = list(features)
    cols += ['lab1_' + str(i) for i in range(LABEL_NUM_1)]
    cols += ['lab2_' + str(i) for i in range(LABEL_NUM_2)]
    cols += ['lab3_' + str(i) for i in range(LABEL_NUM_3)]
    return pd.DataFrame(data, columns=cols)
class DNNPredictor(object):
    """Rebuilds the DNN graph, restores the latest checkpoint, and exposes predict()."""
    def __init__(self, model_hps, input_pipeline_hps, dir_hps):
        tf.reset_default_graph()
        self.model_hps = model_hps
        self.input_pipeline_hps = input_pipeline_hps
        self.dir_hps = dir_hps
        self.sess = tf.Session()
        self.build_model()

    def build_model(self):
        from model.dnn import DNN
        self._model = DNN(self.input_pipeline_hps, self.model_hps)
        self._model.core_builder()
        self.sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=10)
        if self.dir_hps.save_dir:
            model_file = tf.train.latest_checkpoint(self.dir_hps.save_dir + self.dir_hps.load_dir)
            if model_file:
                logging.info('Restoring from: %s', model_file)
                saver.restore(self.sess, model_file)
    def predict(self, instances):
        """Score a numpy batch: 14 scalar columns, then the flattened label blocks."""
        num_rows = len(instances)
        # Each column must feed its own placeholder; 'var3b' and 'var5b' are
        # assumed (hypothetical) names for two distinct inputs whose real
        # identifiers are anonymized here.
        model_feed_dict = {
            self._model.phase: 0,
            self._model.features_dict['var1'][2]: instances[:, 0].reshape(num_rows, 1),
            self._model.features_dict['var2'][2]: instances[:, 1].reshape(num_rows, 1),
            self._model.features_dict['var3'][2]: instances[:, 2].reshape(num_rows, 1),
            self._model.features_dict['var3b'][2]: instances[:, 3].reshape(num_rows, 1),
            self._model.features_dict['var4'][2]: instances[:, 4].reshape(num_rows, 1),
            self._model.features_dict['var5'][2]: instances[:, 5].reshape(num_rows, 1),
            self._model.features_dict['var5b'][2]: instances[:, 6].reshape(num_rows, 1),
            self._model.var6: instances[:, 7].reshape(num_rows, 1),
            self._model.var7: instances[:, 8].reshape(num_rows, 1),
            self._model.var8: instances[:, 9].reshape(num_rows, 1),
            self._model.var9: instances[:, 10].reshape(num_rows, 1),
            self._model.var10: instances[:, 11].reshape(num_rows, 1),
            self._model.var11: instances[:, 12].reshape(num_rows, 1),
            self._model.var12: instances[:, 13].reshape(num_rows, 1),
            self._model.lab1: instances[:, 14:14 + LABEL_NUM_1].reshape(num_rows, LABEL_NUM_1),
            self._model.lab2: instances[:, 14 + LABEL_NUM_1:14 + LABEL_NUM_1 + LABEL_NUM_2].reshape(num_rows, LABEL_NUM_2),
            self._model.lab3: instances[:, 14 + LABEL_NUM_1 + LABEL_NUM_2:14 + LABEL_NUM_1 + LABEL_NUM_2 + LABEL_NUM_3].reshape(num_rows, LABEL_NUM_3),
        }
        outputs = self.sess.run(self._model.y_prob, feed_dict=model_feed_dict)
        return outputs
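# Build the predictor and restore the latest checkpoint; predict() is the
# black-box function handed to SHAP below.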
dnnobj = DNNPredictor(model_hps,input_pipeline_hps,dir_hps)
train_data = sess.run(train_next_batch)
test_data = sess.run(test_next_batch)
train_data_batch = preprocessor(train_data)
test_data_batch = preprocessor(test_data)
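# Consistency check (illustrative, assuming each entry in `features` yields a
# single scalar column): the flattened width must equal the number of column
# names data_to_df will attach, or the labels would misalign.
assert train_data_batch.shape[1] == len(features) + LABEL_NUM_1 + LABEL_NUM_2 + LABEL_NUM_3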
train_data = data_to_df(train_data_batch)
test_data = data_to_df(test_data_batch)
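# Summarize the training batch into 25 weighted k-means centroids to serve as
# the KernelExplainer background set; link="logit" expresses the explanations
# in log-odds space, which is additive for a probability output.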
data_summary = shap.kmeans(train_data, 25)
explainer = shap.KernelExplainer(dnnobj.predict, data_summary, link="logit")
data = test_data
# Estimate SHAP values for each row of the test batch; KernelExplainer returns
# a list with one array per model output, hence the [0] indexing below.
shap_values = explainer.shap_values(data)
ith_obs = 249
shap.force_plot(explainer.expected_value[0], shap_values[0][ith_obs, :], data.iloc[ith_obs, :], link="logit")
NUM_ROWS = input_pipeline_hps.batch_size
shap.force_plot(explainer.expected_value[0], shap_values[0], data.iloc[0:NUM_ROWS, :], link="logit")
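# A natural follow-up (not in the original): rank features by mean |SHAP value|
# across the whole batch.
shap.summary_plot(shap_values[0], data)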